library(readxl)
library(dplyr)
library(tidyr)
library(stringr)

library(tidyverse)
library(DT)
library(patchwork)
library(ggthemes)
library(plotly)
library(gapminder)
library(grid)
library(ggplot2)
library(gridExtra)

library(maps)
library(usmap)

# install.packages("DescTools")
# install.packages("car")
# install.packages("rpart")
# install.packages("rpart.plot")

library(DescTools)
library(car)
library(rpart)
library(rpart.plot)

1 Title

Exploring the Impact of COVID-19 on State-Level Economic Indicators and Housing Prices in the U.S.

1.1 Introduction

This project explores how the COVID-19 pandemic affected economic conditions, focusing on housing prices across U.S. states. While the final goal is to analyze which factors contributed to post-COVID changes in housing prices, the project also covers the essential supporting work: data collection, cleaning, visualization, and statistical analysis.

1.2 Data Sources

  • COVID

Includes two datasets: one providing nationwide annual COVID case totals and another offering state-by-state case counts from 2020 to 2023. These datasets help track the spread and intensity of the pandemic across different regions and time periods, offering important context for analyzing trends.

Source: Centers for Disease Control and Prevention (CDC) (https://covid.cdc.gov/COVID-DATA-TRACKER/#datatracker-home) & USA Facts (https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/)

File name: cases_deaths.csv & covid_confirmed_usafacts.csv

  • Zillow Home Value Index (ZHVI)

A measure of the typical home value and market changes across a given region and housing type. It reflects the typical value for homes in the 35th to 65th percentile range. Available as a smoothed, seasonally adjusted measure and as a raw measure.

Source: Zillow Research (https://www.zillow.com/research/data/)

File name: State_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv

  • State unemployment rates over the last 10 years, seasonally adjusted

This dataset shows monthly unemployment rates by U.S. state. It is part of the Local Area Unemployment Statistics (LAUS) program by the Bureau of Labor Statistics (BLS). The data is seasonally adjusted and helps track labor market trends at the state level over time.

Source: U.S. Bureau of Labor Statistics (https://www.bls.gov/charts/state-employment-and-unemployment/state-unemployment-rates-animated.htm)

File name: unemployment rate.xlsx

  • Personal Consumption Expenditures (PCE)

Reports annual state-level PCE data, measuring average household spending across different regions.

Source: Bureau of Economic Analysis (BEA) (https://apps.bea.gov/regional/downloadzip.htm)

File name: SAPCE1__ALL_AREAS_1997_2023.csv

  • Annual Personal Income

Includes annual personal income per capita by state. Useful for assessing economic status and regional income disparities. Source: Bureau of Economic Analysis (BEA) File name: SAINC1__ALL_AREAS_1929_2024.csv → INCOME.csv

  • Gross Domestic Product (GDP)

Provides real GDP data by state from 1997 to 2024. This helps analyze economic output and growth across regions.

Source: Bureau of Economic Analysis (BEA)

File name: SAGDP1__ALL_AREAS_1997_2024.csv

  • State-Level Population and Household Debt Statistics

Includes per capita credit card debt and total household debt across states. Used to assess consumer financial health.

Sheet Title
population Number of Consumers in New York Fed Consumer Credit Panel
auto loan Auto Debt Balance per Capita
credit card Credit Card Debt Balance per Capita
mortgage Mortgage Debt Balance per Capita (excluding HELOC)
student loan Student Loan Debt Balance per Capita
total Total Debt Balance per Capita
auto loan delinquency Percent of Auto Debt Balance 90+ Days Delinquent
credit card delinquency Percent of Credit Card Debt Balance 90+ Days Delinquent
mortgage delinquency Percent of Mortgage Debt Balance 90+ Days Delinquent
student loan delinquency Percent of Student Loan Debt Balance 90+ Days Delinquent (and in default)

Source: New York Fed (https://www.newyorkfed.org/microeconomics/databank.html) File name: area_report_by_year.xlsx → NYFED.csv

2 DATA CLEANING

2.1 COVID Cases US

# Load nationwide COVID case/death counts and aggregate to monthly totals.
covid <- read_csv("data/cases_deaths.csv")
#View(covid)

# NOTE(review): the knitted output below shows `year` printing as 1 for all
# rows — confirm that `date` parses correctly and year() is lubridate's.
covid_US <- covid %>%
  filter(country == "United States") %>%
  select(country, date, new_cases, total_cases, weekly_cases) %>%
  mutate(
    date = as.Date(date),
    year = year(date),
    month = month(date, label = TRUE, abbr = FALSE)
  ) %>%
  group_by(year, month) %>%
  summarise(
    monthly_new_cases = sum(new_cases, na.rm = TRUE),
    # cases are cumulative, so the month's max is its latest running total
    latest_total_cases = max(total_cases, na.rm = TRUE),
    total_weekly_cases = sum(weekly_cases, na.rm = TRUE),
    .groups = "drop"
  )

covid_US
## # A tibble: 145 × 5
##     year month     monthly_new_cases latest_total_cases total_weekly_cases
##    <dbl> <ord>                 <dbl>              <dbl>              <dbl>
##  1     1 January              820465          103436829            4160575
##  2     1 February             701457          103436829            4418032
##  3     1 March                531503          103436829            4672369
##  4     1 April                616176          103436829            4930142
##  5     1 May                 1107849          103436829            5419154
##  6     1 June                1378542          103436829            5839205
##  7     1 July                 945231          103436829            6101223
##  8     1 August              1056186          103436829            6336944
##  9     1 September           1122716          103436829            6758203
## 10     1 October              740848          103436829            6967548
## # ℹ 135 more rows

2.2 COVID Cases per State

# Load county-level cumulative confirmed cases (USA Facts) and collapse
# counties to state-level daily totals.
covidbystate <- read_csv("data/covid_confirmed_usafacts.csv")

# Sum every numeric column within each state. The anonymous function replaces
# the deprecated across(cols, fn, na.rm = TRUE) "..." pass-through (deprecated
# in dplyr 1.1). Note this also sums the FIPS-code columns; they are dropped
# downstream by the date-pattern filter, so the totals are unaffected.
covidStatesDaily <- covidbystate %>%
  group_by(State) %>%
  summarize(across(where(is.numeric), ~ sum(.x, na.rm = TRUE)))
#covidStatesDaily

# Reshape wide daily columns to long, then keep each state's year-end total.
# Counts are cumulative, so the within-year maximum is the year-end value.
covidstates <- covidStatesDaily %>%
  pivot_longer(-State, names_to = "date", values_to = "cases") %>%
  # keep only columns whose names are ISO dates (drops FIPS-code columns)
  filter(str_detect(date, "^\\d{4}-\\d{2}-\\d{2}$")) %>%
  mutate(date = ymd(date), year = year(date)) %>%
  group_by(State, year) %>%
  summarise(total_cases = max(cases, na.rm = TRUE), .groups = "drop")

#covidstates
# Five-number summary used below as severity cutpoints.
case_summary <- summary(covidstates$total_cases)
#case_summary

# Bucket states into severity tiers using the summary() vector:
# it returns Min, 1st Qu., Median, Mean, 3rd Qu., Max — so indices
# 1, 2, 3 and 5 below are Min, Q1, Median and Q3 respectively.
covidstates <- covidstates %>%
  mutate(
    severity = factor(
      case_when(
        total_cases <= case_summary[1] ~ "Very low",
        total_cases <= case_summary[2] ~ "Low",
        total_cases <= case_summary[3] ~ "Medium",
        total_cases <= case_summary[5] ~ "High",
        TRUE ~ "Very high"
      ),
      levels = c("Very low", "Low", "Medium", "High", "Very high")
    )
  )
covidstates
## # A tibble: 204 × 4
##    State  year total_cases severity
##    <chr> <dbl>       <dbl> <fct>   
##  1 AK     2020       46304 Low     
##  2 AK     2021      149907 Low     
##  3 AK     2022      277884 Low     
##  4 AK     2023      287319 Low     
##  5 AL     2020      361226 Medium  
##  6 AL     2021      896614 High    
##  7 AL     2022     1568934 High    
##  8 AL     2023     1659936 High    
##  9 AR     2020      225138 Low     
## 10 AR     2021      562455 Medium  
## # ℹ 194 more rows

2.3 ZHVI

# Load Zillow ZHVI (mid-tier, smoothed/seasonally adjusted) and reshape to
# one row per state-month.
house <- read_csv("data/State_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")

house1 <- house %>%
  select(-StateName, -RegionID, -SizeRank, -RegionType) %>%
  rename(State = RegionName)
house_long <- house1 %>%
  pivot_longer(-State, names_to = "Date", values_to = "ZHVI")
#house_long

# Restrict to the 2017-2022 window used for the pre/post comparison.
house_long1 <- house_long %>%
  filter(str_sub(Date, 1, 4) %in% as.character(2017:2022))

#house_long1

house_long2 <- house_long1 %>%
  mutate(year = as.numeric(str_sub(Date, 1, 4)))

# Annual mean ZHVI per state.
house_avg <- house_long2 %>%
  group_by(State, year) %>%
  summarise(mean_zhvi = mean(ZHVI, na.rm = TRUE), .groups = "drop")

#house_avg

# One column per year for the row-wise period averages below.
house2 <- house_avg %>%
  pivot_wider(names_from = year, values_from = mean_zhvi)

#house2
# Period averages plus percent changes: pre = 2017->2019, post = 2020->2022.
house3 <- house2 %>%
  mutate(
    zhvi_pre_avg = rowMeans(select(., `2017`, `2018`, `2019`), na.rm = TRUE),
    zhvi_post_avg = rowMeans(select(., `2020`, `2021`, `2022`), na.rm = TRUE),
    zhvi_pre_change = ((`2019` - `2017`) / `2017`) * 100,
    zhvi_post_change = ((`2022` - `2020`) / `2020`) * 100
  ) %>%
  select(State, zhvi_pre_avg, zhvi_post_avg, zhvi_pre_change, zhvi_post_change)

#house3
# State name -> two-letter abbreviation lookup, reused by every section below.
state_codes <- read.csv("https://raw.githubusercontent.com/jasonong/List-of-US-States/master/states.csv")

house4 <- house3 %>%
  left_join(state_codes, by = "State")

# Replace full names with abbreviations so housing joins with the other tables.
house_final <- house4 %>%
  select(Abbreviation, zhvi_pre_avg, zhvi_post_avg, zhvi_pre_change, zhvi_post_change) %>%
  rename(State = Abbreviation)

house_final
## # A tibble: 51 × 5
##    State zhvi_pre_avg zhvi_post_avg zhvi_pre_change zhvi_post_change
##    <chr>        <dbl>         <dbl>           <dbl>            <dbl>
##  1 AL         148993.       192427.            9.76            28.0 
##  2 AK         292351.       335028.            7.66            12.3 
##  3 AZ         252522.       368418.           15.0             49.4 
##  4 AR         140966.       175352.            7.83            26.1 
##  5 CA         521016.       663241.           12.7             31.3 
##  6 CO         375678.       486205.           12.6             32.8 
##  7 CT         251600.       305414.            3.63            28.0 
##  8 DE         262137.       318745.            5.58            27.9 
##  9 DC         560797.       633072.            8.25             7.84
## 10 FL         228291.       308956.           13.2             45.6 
## # ℹ 41 more rows
# export to file but I will # because we already have sync folder with github
#write.csv(house_final, "data/data-clean/HOUSING.csv", row.names = FALSE)

2.4 Unemployment Rates

# Monthly state unemployment rates. Excel stores the header dates as serial
# numbers, so convert them back (origin 1899-12-30 is Excel's day zero).
unemploy <- read_excel("data/unemployment rate.xlsx", range = "A1:DR53")
names(unemploy)[-1] <- as.character(as.Date(as.numeric(names(unemploy)[-1]), origin = "1899-12-30"))

# Pre-COVID: change from Dec 2017 to Dec 2019. Some columns arrive as
# character, hence the as.numeric conversion.
pre_df <- unemploy %>%
  select(State, `2017-12-01`, `2019-12-01`) %>%
  mutate(across(-State, as.numeric)) %>%
  rename(unemp_17 = `2017-12-01`, unemp_19 = `2019-12-01`) %>%
  mutate(unemp_pre = unemp_19 - unemp_17) %>%
  select(State, unemp_pre, unemp_17, unemp_19)

# Post-COVID: change from Dec 2020 to Dec 2022.
post_df <- unemploy %>%
  select(State, `2020-12-01`, `2022-12-01`) %>%
  mutate(across(-State, as.numeric)) %>%
  rename(unemp_20 = `2020-12-01`, unemp_22 = `2022-12-01`) %>%
  mutate(unemp_post = unemp_22 - unemp_20) %>%
  select(State, unemp_post, unemp_20, unemp_22)

unemploy2 <- pre_df %>%
  inner_join(post_df, by = "State")

# Attach two-letter abbreviations so the key matches the other tables.
unemploy3 <- unemploy2 %>%
  left_join(state_codes, by = "State")

unemploy_final <- unemploy3 %>%
  select(Abbreviation, unemp_pre, unemp_17, unemp_19, unemp_post, unemp_20, unemp_22) %>%
  rename(State = Abbreviation)

unemploy_final
## # A tibble: 52 × 7
##    State unemp_pre unemp_17 unemp_19 unemp_post unemp_20 unemp_22
##    <chr>     <dbl>    <dbl>    <dbl>      <dbl>    <dbl>    <dbl>
##  1 AL       -0.9        4        3.1       -2.2      4.5      2.3
##  2 AK       -1.4        6.6      5.2       -3.1      7.1      4  
##  3 AZ       -0.200      4.9      4.7       -2.8      6.5      3.7
##  4 AR       -0.300      3.8      3.5       -2        5        3  
##  5 CA       -0.400      4.5      4.1       -4.6      9        4.4
##  6 CO       -0.100      2.9      2.8       -3.4      6.4      3  
##  7 CT       -0.600      4.3      3.7       -3.9      7.4      3.5
##  8 DE       -0.6        4.2      3.6       -1.6      5.6      4  
##  9 DC       -0.5        5.9      5.4       -3.1      7.8      4.7
## 10 FL       -1          4        3         -3.3      6.2      2.9
## # ℹ 42 more rows
# export to file but I will # because we already have sync folder with github
#write.csv(unemploy_final, "data/data-clean/UNEMPLOY.csv", row.names = FALSE)

2.5 PCE

pce <- read.csv("data/SAPCE1__ALL_AREAS_1997_2023.csv")

# NOTE: the trailing space inside the Description string matches the raw
# BEA file — do not trim it away.
pce_df <- pce %>%
  filter(Description == "Personal consumption expenditures ") %>%
  rename(state = GeoName) %>%
  filter(state != "United States") %>%
  select(state, X2017, X2019, X2020, X2022) %>%
  mutate(
    # Percent change over the pre (2017->2019) and post (2020->2022) windows
    pce_pre = ((X2019 - X2017) / X2017) * 100,
    pce_post = ((X2022 - X2020) / X2020) * 100
  )

# Swap full state names for two-letter abbreviations.
pce_final <- pce_df %>%
  left_join(state_codes, by = c("state" = "State")) %>%
  select(Abbreviation, pce_pre, pce_post) %>%
  rename(State = Abbreviation)

head(pce_final)
##   State   pce_pre pce_post
## 1    AL  8.277837 22.66264
## 2    AK  5.572225 23.88433
## 3    AZ 10.681811 27.98044
## 4    AR  6.486729 22.09850
## 5    CA 10.787929 25.65983
## 6    CO 11.319759 27.96054
# export to file but I will # because we already have sync folder with github
#write.csv(pce_final, "data/data-clean/PCE.csv", row.names = FALSE)

2.6 Annual Personal Income

income <- read.csv("data/SAINC1__ALL_AREAS_1929_2024.csv")

# The Description field carries stray whitespace in the raw file, hence str_trim().
income_df1 <- income %>%
  filter(str_trim(Description) == "Per capita personal income (dollars) 2/") %>%
  rename(state = GeoName) %>%
  filter(state != "United States") %>%
  select(state, X2017, X2019, X2020, X2022) %>%
  mutate(
    # Percent change over the pre (2017->2019) and post (2020->2022) windows
    income_pre = ((X2019 - X2017) / X2017) * 100,
    income_post = ((X2022 - X2020) / X2020) * 100
  )

# Swap full state names for two-letter abbreviations.
income_final <- income_df1 %>%
  left_join(state_codes, by = c("state" = "State")) %>%
  select(Abbreviation, income_pre, income_post) %>%
  rename(State = Abbreviation)

head(income_final)
##   State income_pre income_post
## 1    AL   7.581065   12.688960
## 2    AK   7.086726   11.401225
## 3    AZ  10.317882   13.134632
## 4    AR   5.637273   17.407643
## 5    CA  10.315388    9.528295
## 6    CO  13.109126   18.359079
# export to file but I will # because we already have sync folder with github
#write.csv(income_final, "data/data-clean/INCOME.csv", row.names = FALSE)

2.7 GDP

gdp_df <- read.csv("data/SAGDP1__ALL_AREAS_1997_2024.csv")

# Keep only the real-GDP rows (chained 2017 dollars).
gdp_real <- gdp_df %>%
  filter(Description == "Real GDP (millions of chained 2017 dollars) 1/")

gdp_df2 <- gdp_real %>%
  rename(state = GeoName) %>%
  filter(state != "United States") %>%
  select(state, X2017, X2019, X2020, X2022) %>%
  mutate(
    # Percent change over the pre (2017->2019) and post (2020->2022) windows
    gdp_pre = ((X2019 - X2017) / X2017) * 100,
    gdp_post = ((X2022 - X2020) / X2020) * 100
  )

# Swap full state names for two-letter abbreviations.
gdp_final <- gdp_df2 %>%
  left_join(state_codes, by = c("state" = "State")) %>%
  select(Abbreviation, gdp_pre, gdp_post) %>%
  rename(State = Abbreviation)

head(gdp_final)
##   State   gdp_pre   gdp_post
## 1    AL  3.996621  7.3182724
## 2    AK -2.191186  0.8628568
## 3    AZ  7.948898 12.3828137
## 4    AR  2.694002  8.7421859
## 5    CA  8.358128  8.5462064
## 6    CO  9.570425 10.1607412
# export to file but I will # because we already have sync folder with github
#write.csv(gdp_final, "data/data-clean/GDP.csv", row.names = FALSE)

2.8 Population & Debt Statistics

file_path <- "data/area_report_by_year.xlsx"

# The workbook contains many sheets, so list the ones we need explicitly.
# The "auto" sheet has a different layout, so it is handled separately below.
target_sheets <- c(
"population",
"creditcard",
"mortgage",
"student loan",
"total",
"auto_delinq",
"creditcard_delinq",
"mortgage_delinq",
"studentloan_delinq"
)

# "auto" has a different format (its header starts earlier), so it is read separately.
auto <- read_excel(file_path, sheet = "auto", skip = 4, col_names = TRUE)
#auto
# Read each remaining sheet and assign() it to a global variable named after
# the sheet, with spaces replaced by underscores ("student loan" -> student_loan).
for (sheet_name in target_sheets) {
  
  df_name <- gsub(" ", "_", sheet_name)
  
  # The real data start at row 9 in the xlsx file.
  df <- read_excel(file_path, sheet = sheet_name, skip = 8, col_names = TRUE)
  
  assign(df_name, df)
  #print(df_name)
}

# After load the file, it's nice to check but I will make it # to make the document short.
#population
#auto
#creditcard
#mortgage
#student_loan
#total
#auto_delinq
#creditcard_delinq
#mortgage_delinq
#studentloan_delinq
#population
# The six balance sheets share one transformation: percent change of the Q4
# value over the pre-COVID (2017->2019) and post-COVID (2020->2022) windows.
# Factor it out instead of repeating the same pipeline six times.
#
# @param data   sheet tibble with columns state, Q4_2017, Q4_2019, Q4_2020, Q4_2022
# @param prefix prefix for the output change columns
# @return tibble with columns: state, <prefix>_pre, <prefix>_post
pct_change_by_state <- function(data, prefix) {
  data %>%
    filter(!is.na(state)) %>%
    mutate(
      # glue-style name injection (rlang `:=`) builds the output column names
      "{prefix}_pre" := ((Q4_2019 - Q4_2017) / Q4_2017) * 100,
      "{prefix}_post" := ((Q4_2022 - Q4_2020) / Q4_2020) * 100
    ) %>%
    select(state, all_of(paste0(prefix, c("_pre", "_post"))))
}

population_clean  <- pct_change_by_state(population, "population")
# population_clean
creditcard_clean  <- pct_change_by_state(creditcard, "creditcard")
# creditcard_clean
auto_clean        <- pct_change_by_state(auto, "auto")
# auto_clean
mortgage_clean    <- pct_change_by_state(mortgage, "mortgage")
# mortgage_clean
studentloan_clean <- pct_change_by_state(student_loan, "studentloan")
# studentloan_clean
total_clean       <- pct_change_by_state(total, "total")

# total_clean 
# Delinquency sheets are already rates, not balances: just snapshot Q4 2019
# (pre-COVID) and Q4 2022 (post-COVID) per state. One helper replaces four
# copies of the same pipeline.
#
# @param data   sheet tibble with columns state, Q4_2019, Q4_2022
# @param prefix prefix for the output column names
# @return tibble with columns: state, <prefix>_delinq_pre, <prefix>_delinq_post
delinq_by_state <- function(data, prefix) {
  data %>%
    filter(!is.na(state)) %>%
    select(state, Q4_2019, Q4_2022) %>%
    # glue-style name injection (rlang `:=`) builds the output column names
    rename(
      "{prefix}_delinq_pre" := Q4_2019,
      "{prefix}_delinq_post" := Q4_2022
    )
}

creditcard_delinq_clean  <- delinq_by_state(creditcard_delinq, "creditcard")
# creditcard_delinq_clean
auto_delinq_clean        <- delinq_by_state(auto_delinq, "auto")
# auto_delinq_clean
mortgage_delinq_clean    <- delinq_by_state(mortgage_delinq, "mortgage")
# mortgage_delinq_clean
studentloan_delinq_clean <- delinq_by_state(studentloan_delinq, "studentloan")

# studentloan_delinq_clean
# Combine all NYFED pieces into one state-level table; reduce() applies the
# same sequence of left joins the original chained by hand.
NYFED_df <- list(
  population_clean, creditcard_clean, auto_clean, mortgage_clean,
  studentloan_clean, total_clean, creditcard_delinq_clean,
  auto_delinq_clean, mortgage_delinq_clean, studentloan_delinq_clean
) %>%
  reduce(left_join, by = "state")

head(NYFED_df)
## # A tibble: 6 × 21
##   state population_pre population_post creditcard_pre creditcard_post auto_pre
##   <chr>          <dbl>           <dbl>          <dbl>           <dbl>    <dbl>
## 1 AK            -0.444           -1.15           3.98            13.6   -0.998
## 2 AL             1.58             1.22           7.59            15.5    7.74 
## 3 AR             1.53             2.31          10               16.1    5.34 
## 4 AZ             4.47             4.48          10.2             16.5    9.41 
## 5 CA             1.57             1.87          11.4             19.8    7.53 
## 6 CO             3.31             3.61           8.22            17.6    4.75 
## # ℹ 15 more variables: auto_post <dbl>, mortgage_pre <dbl>,
## #   mortgage_post <dbl>, studentloan_pre <dbl>, studentloan_post <dbl>,
## #   total_pre <dbl>, total_post <dbl>, creditcard_delinq_pre <dbl>,
## #   creditcard_delinq_post <dbl>, auto_delinq_pre <dbl>,
## #   auto_delinq_post <dbl>, mortgage_delinq_pre <dbl>,
## #   mortgage_delinq_post <dbl>, studentloan_delinq_pre <dbl>,
## #   studentloan_delinq_post <dbl>
# Export final result if needed
#write.csv(NYFED_df, "data/data-clean/NYFED.csv", row.names = FALSE)

2.9 Final Data Join

# Re-load the per-source cleaned tables from disk.
clean_gdp      <- read_csv("data/data-clean/GDP.csv")
clean_income   <- read_csv("data/data-clean/INCOME.csv")
clean_nyfed    <- read_csv("data/data-clean/NYFED.csv")
clean_pce      <- read_csv("data/data-clean/PCE.csv")
clean_unemploy <- read_csv("data/data-clean/UNEMPLOY.csv")
clean_housing  <- read_csv("data/data-clean/HOUSING.csv")

#clean_gdp 
#clean_income
#clean_nyfed 
#clean_pce 
#clean_unemploy 

# NYFED uses lowercase "state"; align it with the other tables.
clean_nyfed <- clean_nyfed %>%
  rename(State = state)

# States present in every table
common_states <- Reduce(intersect, list(
  clean_gdp$State,
  clean_income$State,
  clean_pce$State,
  clean_nyfed$State,
  clean_unemploy$State,
  clean_housing$State
))

# Restrict every table to the common states
clean_gdp      <- filter(clean_gdp, State %in% common_states)
clean_income   <- filter(clean_income, State %in% common_states)
clean_pce      <- filter(clean_pce, State %in% common_states)
clean_nyfed    <- filter(clean_nyfed, State %in% common_states)
clean_unemploy <- filter(clean_unemploy, State %in% common_states)
clean_housing  <- filter(clean_housing, State %in% common_states)

# Merge into one data set (housing first so its columns lead).
merged_df <- clean_housing %>%
  left_join(clean_gdp, by = "State") %>%
  left_join(clean_income, by = "State") %>%
  left_join(clean_pce, by = "State") %>%
  left_join(clean_nyfed, by = "State") %>%
  left_join(clean_unemploy, by = "State")

merged_df
## # A tibble: 51 × 37
##    State zhvi_pre_avg zhvi_post_avg zhvi_pre_change zhvi_post_change gdp_pre
##    <chr>        <dbl>         <dbl>           <dbl>            <dbl>   <dbl>
##  1 AL         148993.       192427.            9.76            28.0    4.00 
##  2 AK         292351.       335028.            7.66            12.3   -2.19 
##  3 AZ         252522.       368418.           15.0             49.4    7.95 
##  4 AR         140966.       175352.            7.83            26.1    2.69 
##  5 CA         521016.       663241.           12.7             31.3    8.36 
##  6 CO         375678.       486205.           12.6             32.8    9.57 
##  7 CT         251600.       305414.            3.63            28.0    0.493
##  8 DE         262137.       318745.            5.58            27.9    7.29 
##  9 DC         560797.       633072.            8.25             7.84   3.43 
## 10 FL         228291.       308956.           13.2             45.6    6.90 
## # ℹ 41 more rows
## # ℹ 31 more variables: gdp_post <dbl>, income_pre <dbl>, income_post <dbl>,
## #   pce_pre <dbl>, pce_post <dbl>, population_pre <dbl>, population_post <dbl>,
## #   creditcard_pre <dbl>, creditcard_post <dbl>, auto_pre <dbl>,
## #   auto_post <dbl>, mortgage_pre <dbl>, mortgage_post <dbl>,
## #   studentloan_pre <dbl>, studentloan_post <dbl>, total_pre <dbl>,
## #   total_post <dbl>, creditcard_delinq_pre <dbl>, …
# export to file but I will # because we already have sync folder with github
# write.csv(merged_df, "data/data-clean/FINAL.csv", row.names = FALSE)

3 COVID Mapping Visualization (EDA)

# Load the final merged data set (the result of the Section 2.9 join).
df<- read_csv("data/data-clean/FINAL.csv")

#df

3.1 COVID distribution

3.1.1 Severity

#ggplot(covidstates, aes(x = severity, fill = severity)) +
#  geom_bar() +
#  labs(title = "Number of States by COVID Severity Level",
#       x = "Severity Level", y = "Number of States") +
#  scale_fill_manual(values = c("Very low" = "lightgreen", "Low" = "yellow", "Medium" = "orange", "High" = #"red", "Very high" = "brown")) +
#  theme_minimal()

# Boxplot of year-end case totals within each severity tier.
ggplot(covidstates, aes(x = severity, y = total_cases, fill = severity)) +
  geom_boxplot() +
  scale_fill_manual(values = c(
    "Very low" = "lightgreen", "Low" = "yellow", "Medium" = "orange",
    "High" = "red", "Very high" = "brown"
  )) +
  labs(title = "Distribution of Total Cases by Severity Level",
       x = "Severity", y = "Total Cases") +
  theme_minimal()

3.1.2 By year (2020-2023)

# Share of states in each severity tier per year (bars stacked to 100%).
covidstates %>%
  ggplot(aes(x = year, fill = severity)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c(
    "Very low" = "lightgreen", "Low" = "yellow", "Medium" = "orange",
    "High" = "red", "Very high" = "brown"
  )) +
  labs(title = "Distribution of COVID Severity by Year",
       y = "Proportion of States", x = "Year") +
  theme_minimal()

3.2 6 States

# Yearly cumulative case totals for six populous states.
covidstates %>%
  filter(State %in% c("CA", "TX", "NY", "IL", "AZ", "FL")) %>%
  ggplot(aes(x = year, y = total_cases, color = State)) +
  # NOTE(review): `size` for line widths is deprecated in ggplot2 >= 3.4
  # in favor of `linewidth` — switch once the package version is confirmed.
  geom_line(size = 1.2) +
  theme_minimal() +
  labs(title = "COVID Cases Over Time",
       x = "Year", y = "Total Cases")

3.3 COVID US Map

# Abbreviation -> full-name lookup built from base R's state vectors.
state_abbreviations <- data.frame(
  abbreviation = state.abb,
  full_name = state.name
)


# Replace two-letter codes with lowercase full names so they match the
# region names produced by map_data("state").
covidstatesFULLNAME <- covidstates %>%
  left_join(state_abbreviations, by = c("State" = "abbreviation")) %>%
  mutate(State = tolower(full_name)) %>%
  select(-full_name)

# State polygon outlines for the choropleth.
us_states <- map_data("state")
us_states$region <- tolower(us_states$region)

# Attach severity to each polygon vertex; drop regions without COVID data.
covid_map_data <- us_states %>%
  left_join(covidstatesFULLNAME, by = c("region" = "State")) %>%
  filter(!is.na(severity))


ggplot(covid_map_data, aes(x = long, y = lat, group = group, fill = severity)) +
  geom_polygon(color = "black") +
  coord_fixed(1.1) +
  scale_fill_manual(values = c(
    "Very low" = "lightgreen", "Low" = "yellow", "Medium" = "orange",
    "High" = "red", "Very high" = "brown"
  )) +
  labs(title = "COVID Case Severity by State",
       fill = "Severity") +
  theme_void() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

4 Visuals of Economic Indicators (EDA)

4.1 Population

4.1.1 Formula

Variable Name Formula or Definition Description
population_pre ((Population_2019 - Population_2017) / Population_2017) × 100 Population growth rate (Pre-COVID)
population_post ((Population_2022 - Population_2020) / Population_2020) × 100 Population growth rate (Post-COVID)

4.1.2 Pre (2017–2019)

# Rename State -> state (lowercase) for use with plot_usmap() below.
merged_dfLC <- merged_df %>%
  rename(state = State)

## PRE covid populations
# Choropleth of the pre-COVID population growth rate (2017–2019) by state.
plot_usmap(data = merged_dfLC, values = "population_pre", regions = "state") +
  scale_fill_continuous(
    name = "Population",
    low = "lightgreen", high = "darkgreen", label = scales::comma
  ) +
  labs(title = "Pre-COVID Population (2017–2019)") +
  theme(legend.position = "right")

4.1.3 Post (2020–2022)

## POST covid populations
# Choropleth of the post-COVID population growth rate (2020–2022) by state.
plot_usmap(data = merged_dfLC, values = "population_post", regions = "state") +
  scale_fill_continuous(
    name = "Population",
    low = "lightblue", high = "darkblue", label = scales::comma
  ) +
  labs(title = "Post-COVID Population (2020–2022)") +
  theme(legend.position = "right")

4.1.4 Comparison

### Pre and post side by side
# Long format with a human-readable, ordered period label for faceting.
population_long <- merged_dfLC %>%
  select(state, population_pre, population_post) %>%
  pivot_longer(
    cols = c(population_pre, population_post),
    names_to = "period",
    values_to = "population"
  ) %>%
  mutate(
    period = factor(
      dplyr::recode(period,
                    "population_pre" = "Pre-COVID (2017–2019)",
                    "population_post" = "Post-COVID (2020–2022)"),
      levels = c("Pre-COVID (2017–2019)", "Post-COVID (2020–2022)")
    )
  )

# Two facets, one per period, on a shared fill scale.
plot_usmap(data = population_long, values = "population", regions = "state") +
  facet_wrap(~ period) +
  scale_fill_continuous(
    name = "Population",
    low = "lightgreen", high = "darkgreen", label = scales::comma
  ) +
  labs(title = "Population: Pre vs. Post COVID") +
  theme(legend.position = "right")

4.2 GDP

4.2.1 Formula

Variable Name Formula or Definition Description
gdp_pre ((GDP_2019 - GDP_2017) / GDP_2017) × 100 GDP growth rate (Pre-COVID)
gdp_post ((GDP_2022 - GDP_2020) / GDP_2020) × 100 GDP growth rate (Post-COVID)

4.2.2 Visual

# Grouped bars: pre- vs post-COVID GDP growth, states ordered by post growth.
# geom_col() is identical to geom_bar(stat = "identity"); the opposite
# nudges place the two series side by side.
df %>%
  select(State, gdp_pre, gdp_post) %>%
  ggplot() +
  geom_col(aes(x = reorder(State, -gdp_post), y = gdp_pre,
               fill = "GDP pre = ((X2019 - X2017) / X2017) * 100"),
           position = position_nudge(x = -0.3), width = 0.4) +
  geom_col(aes(x = reorder(State, -gdp_post), y = gdp_post,
               fill = "GDP post = ((X2022 - X2020) / X2020) * 100"),
           position = position_nudge(x = 0.3), width = 0.4) +
  labs(title = "Real GDP Growth: Pre vs Post COVID", y = "GDP Change (%)", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")

4.3 Income

4.3.1 Formula

Variable Name Formula or Definition Description
income_pre ((Income_2019 - Income_2017) / Income_2017) × 100 Personal income growth (Pre-COVID)
income_post ((Income_2022 - Income_2020) / Income_2020) × 100 Personal income growth (Post-COVID)

4.3.2 Visual

# Grouped bars: pre- vs post-COVID income growth, states ordered by post growth.
df %>%
  select(State, income_pre, income_post) %>%
  ggplot() +
  geom_col(aes(x = reorder(State, -income_post), y = income_pre,
               fill = "Income pre = ((X2019 - X2017) / X2017) * 100"),
           position = position_nudge(x = -0.3), width = 0.4) +
  geom_col(aes(x = reorder(State, -income_post), y = income_post,
               fill = "Income post = ((X2022 - X2020) / X2020) * 100"),
           position = position_nudge(x = 0.3), width = 0.4) +
  labs(title = "Personal Income Growth: Pre vs Post COVID", y = "Income Change (%)", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")

4.4 PCE

4.4.1 Formula

Variable Name Formula or Definition Description
pce_pre ((PCE_2019 - PCE_2017) / PCE_2017) × 100 PCE growth rate (Pre-COVID)
pce_post ((PCE_2022 - PCE_2020) / PCE_2020) × 100 PCE growth rate (Post-COVID)

4.4.2 Visual

# Grouped bars: pre- vs post-COVID PCE growth, states ordered by post growth.
df %>%
  select(State, pce_pre, pce_post) %>%
  ggplot() +
  geom_col(aes(x = reorder(State, -pce_post), y = pce_pre,
               fill = "PCE pre = ((X2019 - X2017) / X2017) * 100"),
           position = position_nudge(x = -0.3), width = 0.4) +
  geom_col(aes(x = reorder(State, -pce_post), y = pce_post,
               fill = "PCE post = ((X2022 - X2020) / X2020) * 100"),
           position = position_nudge(x = 0.3), width = 0.4) +
  labs(title = "Personal Consumption Expenditures Growth: Pre vs Post COVID",
       y = "PCE Change (%)", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")

4.5 Unemployment Rate

4.5.1 Formula

Variable Name Formula or Definition Description
unemp_pre Unemp_2019 - Unemp_2017 Change in unemployment rate (Pre-COVID)
unemp_post Unemp_2022 - Unemp_2020 Change in unemployment rate (Post-COVID)

4.5.2 Visual

# Grouped bars: change in unemployment rate (percentage points), pre vs post.
df %>%
  select(State, unemp_pre, unemp_post) %>%
  ggplot() +
  geom_col(aes(x = reorder(State, -unemp_post), y = unemp_pre,
               fill = "Unemp pre = 2019 - 2017"),
           position = position_nudge(x = -0.3), width = 0.4) +
  geom_col(aes(x = reorder(State, -unemp_post), y = unemp_post,
               fill = "Unemp post = (2022 - 2020)"),
           position = position_nudge(x = 0.3), width = 0.4) +
  labs(title = "Unemployment Rate Change: 2017–2019 vs 2020–2022", y = "Change in Unemployment Rate", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")

# Keep just the four year snapshots for the line chart below.
df_emp <- df %>%
  select(State, unemp_17, unemp_19, unemp_20, unemp_22)

#df_emp

# Long format: one row per state-year observation, with readable year labels.
df_long <- df_emp %>%
  pivot_longer(starts_with("unemp_"), names_to = "Year", values_to = "Unemployment") %>%
  mutate(Year = dplyr::recode(Year,
                              "unemp_17" = "2017",
                              "unemp_19" = "2019",
                              "unemp_20" = "2020",
                              "unemp_22" = "2022"))

#df_long 

# One line per state; legend suppressed since 50+ entries would swamp it.
ggplot(df_long, aes(x = Year, y = Unemployment, group = State, color = State)) +
  geom_line(size = 1) +
  labs(title = "Unemployment Rate by State", y = "Unemployment Rate (%)", x = "Year") +
  theme(legend.position = "none")

5 Debt & Delinquency Visuals

5.1 Credit Card

5.1.1 Formula

Variable Name Formula or Definition Description
creditcard_pre ((Q4_2019 - Q4_2017) / Q4_2017) × 100 Credit card debt growth rate (Pre-COVID)
creditcard_post ((Q4_2022 - Q4_2020) / Q4_2020) × 100 Credit card debt growth rate (Post-COVID)
creditcard_delinq_pre Q4_2019 Credit card delinquency rate (Pre-COVID)
creditcard_delinq_post Q4_2022 Credit card delinquency rate (Post-COVID)

5.1.2 Visual 1

# Long format: one row per (State, Time) with the credit-card debt growth
# rate; Time is a factor so "pre" always plots before "post".
credit_df <- df %>% 
  select(State, creditcard_pre, creditcard_post) %>% 
  pivot_longer(cols = c(creditcard_pre, creditcard_post),
               names_to = "Time", values_to = "Change") %>% 
  mutate(Time = factor(Time, levels = c("creditcard_pre", "creditcard_post")))

## credit_df

# Dodged bars: pre vs post growth rate per state.
# Fixed label typo: "Dept" -> "Debt".
ggplot(credit_df, aes(x = State, y = Change, fill = Time)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.7), width = 0.9) +  
  labs(title = "Change Rate in Credit Card Debt",
       y = "Credit Card Debt Change Rate (%)", x = "")+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        legend.position = "bottom")

5.1.3 Visual 2

# Long format: one row per (State, Time) with the credit-card delinquency
# rate (levels: Q4 2019 vs Q4 2022 per the formula table above).
credit_delinq_df <- df %>% 
  select(State, creditcard_delinq_pre, creditcard_delinq_post) %>% 
  pivot_longer(cols = c(creditcard_delinq_pre, creditcard_delinq_post),
               names_to = "Time", values_to = "Change") %>% 
  mutate(Time = factor(Time, levels = c("creditcard_delinq_pre", "creditcard_delinq_post")))

#credit_delinq_df

# Violin of the debt growth-rate distribution (uses credit_df from 5.1.2).
gg_debt <- ggplot(credit_df, aes(x = Time, y = Change, fill = Time)) +
  geom_violin(trim = FALSE, alpha = 0.5) +
  labs(title = "Credit Card Debt Change (%)", y = "Change Rate (%)", x = "")+
  theme(legend.position = "bottom")

#gg_debt 

# Violin of the delinquency-rate distribution, pre vs post.
gg_delinq <- ggplot(credit_delinq_df, aes(x = Time, y = Change, fill = Time)) +
  geom_violin(trim = FALSE, alpha = 0.5) +
  labs(title = "Credit Card Delinquency Rate (%)", y = "Delinquency Rate (%)", x = "")+
  theme(legend.position = "bottom")

#gg_delinq

# Debt change and delinquency side by side.
grid.arrange(gg_debt, gg_delinq, ncol = 2)

5.2 Mortgage

5.2.1 Formula

Variable Name Formula or Definition Description
mortgage_pre ((Q4_2019 - Q4_2017) / Q4_2017) × 100 Mortgage debt growth rate (Pre-COVID)
mortgage_post ((Q4_2022 - Q4_2020) / Q4_2020) × 100 Mortgage debt growth rate (Post-COVID)
mortgage_delinq_pre Q4_2019 Mortgage delinquency rate (Pre-COVID)
mortgage_delinq_post Q4_2022 Mortgage delinquency rate (Post-COVID)

5.2.2 Visual

#Percent Change in Mortgage Debt Balance per Capita (2017 → 2019)
# Fixed label typos: "Capital" -> "Capita", "Dept" -> "Debt".
qq1 <- ggplot(df, aes(x = "", y = mortgage_pre)) +
  geom_boxplot(fill = "pink") +
  labs(title = "Mortgage Debt per Capita - Pre", y = "Mortgage Debt Change Rate (%)")

# Percent Change in Mortgage Debt Balance per Capita (2020 → 2022)
qq2 <- ggplot(df, aes(x = "", y = mortgage_post)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Mortgage Debt per Capita - Post", y = "Mortgage Debt Change Rate (%)")

# Mortgage delinquency rate level, Q4 2019 (per the formula table).
qq3 <- ggplot(df, aes(x = "", y = mortgage_delinq_pre)) +
  geom_boxplot(fill = "orange") +
  labs(title = "Mortgage Delinquency - Pre", y = "Delinquency Rate (%)")

# Mortgage delinquency rate level, Q4 2022.
qq4 <- ggplot(df, aes(x = "", y = mortgage_delinq_post)) +
  geom_boxplot(fill = "purple") +
  labs(title = "Mortgage Delinquency - Post", y = "Delinquency Rate (%)")

# All four distributions side by side.
grid.arrange(qq1, qq2, qq3, qq4, ncol = 4)

5.3 Student Loan

5.3.1 Formula

Variable Name Formula or Definition Description
studentloan_pre ((Q4_2019 - Q4_2017) / Q4_2017) × 100 Student loan debt growth rate (Pre-COVID)
studentloan_post ((Q4_2022 - Q4_2020) / Q4_2020) × 100 Student loan debt growth rate (Post-COVID)
studentloan_delinq_pre Q4_2019 Student loan delinquency rate (Pre-COVID)
studentloan_delinq_post Q4_2022 Student loan delinquency rate (Post-COVID)

5.3.2 Visual

# Side-by-side bars of student-loan delinquency rates (Q4 2019 vs Q4 2022)
# per state, ordered by the post-COVID rate. geom_col() is the idiomatic
# shorthand for geom_bar(stat = "identity").
df %>%
  select(State, studentloan_delinq_pre, studentloan_delinq_post) %>%
  ggplot() +
  geom_col(aes(x = reorder(State, -studentloan_delinq_post)
               , y = studentloan_delinq_pre
               ,fill = "Student Loan pre = Q4_2019")
           , position = position_nudge(x = -0.3)
           , width = 0.4) +
  geom_col(aes(x = reorder(State, -studentloan_delinq_post)
               , y = studentloan_delinq_post
               , fill = "Student Loan post = Q4_2022")
           , position = position_nudge(x = 0.3)
           , width = 0.4) +
  labs(title = "Student Loan Delinquency: Pre vs Post COVID", y = "Delinquency Rate (%)", x = "") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),legend.position = "bottom")

# Long-format student-loan debt growth rates, with Time converted to an
# ordered Pre/Post factor in a single mutate step.
student_df <- df %>%
  select(State, studentloan_pre, studentloan_post) %>%
  pivot_longer(cols = c(studentloan_pre, studentloan_post),
               names_to = "Time", values_to = "Change") %>%
  mutate(Time = factor(ifelse(Time == "studentloan_pre", "Pre", "Post"),
                       levels = c("Pre", "Post")))

#student_df

# Violin plot comparing the pre vs post growth-rate distributions.
gg_student <- ggplot(student_df, aes(x = Time, y = Change, fill = Time)) +
  geom_violin(trim = FALSE, alpha = 0.4) +
  scale_fill_manual(values = c("Pre" = "orange", "Post" = "purple")) +
  labs(title = "Student Loan Debt Change (%)", y = "Change Rate (%)", x = "")

#gg_student

#names(df)

# Long-format student-loan delinquency rates (levels, not growth rates:
# delinq_pre = Q4 2019, delinq_post = Q4 2022 per the formula table).
student_df2 <- df %>% 
  select(State, studentloan_delinq_pre, studentloan_delinq_post) %>% 
  pivot_longer(cols = c(studentloan_delinq_pre, studentloan_delinq_post), names_to = "Time", values_to = "Change") %>%
  mutate(Time= ifelse(Time == "studentloan_delinq_pre", "Pre", "Post"))

#student_df2

student_df2$Time <- factor(student_df2$Time, levels = c("Pre", "Post"))

# Fixed labels: "Debt Deliq Change" was misleading -- these are delinquency
# rates (levels), not change rates.
gg_student2 <- ggplot(student_df2, aes(x = Time, y = Change, fill = Time)) +
  geom_violin(trim = FALSE, alpha = 0.4) +
  labs(title = "Student Loan Delinquency Rate (%)", y = "Delinquency Rate (%)", x = "") +
  scale_fill_manual(values = c( "Pre" = "yellow", "Post" = "red"))

#gg_student2

# Debt-change and delinquency violins side by side.
grid.arrange(gg_student, gg_student2, ncol = 2)

5.4 Auto

5.4.1 Formula

Variable Name Formula Description
auto_pre ((Q4_2019 - Q4_2017) / Q4_2017) × 100 Auto loan debt growth rate (Pre-COVID)
auto_post ((Q4_2022 - Q4_2020) / Q4_2020) × 100 Auto loan debt growth rate (Post-COVID)
auto_delinq_pre Q4_2019 Auto loan delinquency rate (Pre-COVID)
auto_delinq_post Q4_2022 Auto loan delinquency rate (Post-COVID)

5.4.2 Visual

# Stack the four auto-loan metrics into (Category, Value) pairs, preserving
# the pre/post display order with an explicit factor level ordering.
df_auto <- df %>%
  select(auto_pre, auto_post, auto_delinq_pre, auto_delinq_post) %>%
  pivot_longer(everything(), names_to = "Category", values_to = "Value") %>%
  mutate(Category = factor(Category,
                           levels = c("auto_pre", "auto_post",
                                      "auto_delinq_pre", "auto_delinq_post")))


# One boxplot per metric; x labels tilted so the long names stay legible.
ggplot(df_auto, aes(x = Category, y = Value, fill = Category)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Boxplots of Auto Loan and Delinquency (Pre vs Post)",
       y = "Value",
       x = NULL) +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 20, hjust = 1))

6 Housing Change Rate Visualizations

6.1 Formula

Variable Name Formula Description
zhvi_post_change ((ZHVI_2022 - ZHVI_2020) / ZHVI_2020) × 100 Housing price change rate (Post-COVID)
zhvi_post_avg (ZHVI_2020 + ZHVI_2021 + ZHVI_2022) / 3 Average housing price during post-COVID years
zhvi_pre_change ((ZHVI_2019 - ZHVI_2017) / ZHVI_2017) × 100 Housing price change rate (Pre-COVID)
zhvi_pre_avg (ZHVI_2017 + ZHVI_2018 + ZHVI_2019) / 3 Average housing price during pre-COVID years

6.2 Pre (2017–2019)

# plot_usmap() matches on a column literally named "state" (lower case),
# so rename before mapping. merged_dfLC is reused by the two maps below.
merged_dfLC <- merged_df %>%
  rename(state = State)

## heat map for PRE covid housing change rate
plot_usmap(data = merged_dfLC, values = "zhvi_pre_change", regions = "state") +
  scale_fill_continuous(
    name = "Zillow Home Value Change (%)",
    low = "lightyellow", high = "red", label = scales::comma
  ) +
  labs(title = "Pre-COVID Housing Price Change (2017–2019)") +
  theme(legend.position = "right")

6.3 Post (2020–2022)

## heat map for POST covid housing change rate
# Same map as 6.2 but on zhvi_post_change, with a blue gradient so the two
# periods are visually distinct.
plot_usmap(data = merged_dfLC, values = "zhvi_post_change", regions = "state") +
  scale_fill_continuous(
    name = "Zillow Home Value Change (%)",
    low = "lightblue", high = "darkblue", label = scales::comma
  ) +
  labs(title = "Post-COVID Housing Price Change (2020–2022)") +
  theme(legend.position = "right")

6.4 Comparison

### both pre and post, side by side

# Long format for faceting: one row per (state, period) with the ZHVI
# change; the factor ordering keeps "Pre" as the left facet.
housing_long <- merged_dfLC %>%
  select(state, zhvi_pre_change, zhvi_post_change) %>%
  pivot_longer(
    cols = c(zhvi_pre_change, zhvi_post_change),
    names_to = "period",
    values_to = "change"
  ) %>%
  mutate(
    period = dplyr::recode(period,
                    "zhvi_pre_change" = "Pre-COVID (2017–2019)",
                    "zhvi_post_change" = "Post-COVID (2020–2022)")) %>% 
  mutate(period = factor(period, levels = c("Pre-COVID (2017–2019)", "Post-COVID (2020–2022)")))



# Faceted choropleths sharing one fill scale, so pre/post are comparable.
plot_usmap(data = housing_long, values = "change", regions = "state") +
  facet_wrap(~ period) +
  scale_fill_continuous(
    name = "ZHVI Change (%)",
    low = "lightyellow", high = "red", label = scales::comma
  ) +
  labs(title = "Housing Price Change: Pre vs. Post COVID") +
  theme(legend.position = "right")

7 Statistic

7.1 Paired t-test

7.1.1 ZHVI

# Per-state average ZHVI before and after COVID.
housing_summary <- clean_housing %>%
  select(State, zhvi_pre_avg, zhvi_post_avg)

# Run paired t-test: paired because pre and post are measured on the same
# states; H0 is that the mean within-state difference is zero.
t_test_result <- t.test(housing_summary$zhvi_post_avg, housing_summary$zhvi_pre_avg, 
                        paired = TRUE, alternative = "two.sided")

t_test_result
## 
##  Paired t-test
## 
## data:  housing_summary$zhvi_post_avg and housing_summary$zhvi_pre_avg
## t = 14.386, df = 50, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  57966.91 76779.44
## sample estimates:
## mean difference 
##        67373.18

7.1.2 Mortgage

# Per-state mortgage debt growth rates before and after COVID.
mortgage_summary <- merged_df %>%
  select(State, mortgage_pre, mortgage_post)

# Run paired t-test on the summary frame selected above (it was previously
# created but unused), matching the ZHVI test in 7.1.1. Values are identical
# to testing the merged_df columns directly.
t_test_result2 <- t.test(mortgage_summary$mortgage_post, mortgage_summary$mortgage_pre, 
                        paired = TRUE, alternative = "two.sided")

t_test_result2
## 
##  Paired t-test
## 
## data:  mortgage_summary$mortgage_post and mortgage_summary$mortgage_pre
## t = 25.871, df = 50, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##   8.822871 10.308125
## sample estimates:
## mean difference 
##        9.565498

7.1.3 Income

# Per-state income growth rates before and after COVID.
income_summary <- merged_df %>%
  select(State, income_pre, income_post)

# Run paired t-test on the summary frame selected above (it was previously
# created but unused), matching the ZHVI test in 7.1.1. Values are identical
# to testing the merged_df columns directly.
t_test_result3 <- t.test(income_summary$income_post, income_summary$income_pre, 
                        paired = TRUE, alternative = "two.sided")

t_test_result3
## 
##  Paired t-test
## 
## data:  income_summary$income_post and income_summary$income_pre
## t = 10.794, df = 50, p-value = 1.156e-14
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  3.336458 4.862107
## sample estimates:
## mean difference 
##        4.099282

7.2 Correlation

7.2.1 ZHVI

# Correlation between housing prices pre and post
# Pearson correlation of state-level pre vs post average ZHVI.
cor_zhvi <- cor.test(merged_df$zhvi_pre_avg, merged_df$zhvi_post_avg, method = "pearson")
cor_zhvi
## 
##  Pearson's product-moment correlation
## 
## data:  merged_df$zhvi_pre_avg and merged_df$zhvi_post_avg
## t = 36.948, df = 49, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9694276 0.9900366
## sample estimates:
##       cor 
## 0.9825222
# Scatter with a linear fit; r = 0.98 from the test above.
ggplot(merged_df, aes(x = zhvi_pre_avg, y = zhvi_post_avg)) +
  geom_point(color = "blue") + 
  geom_smooth(method = "lm", color = "red", se = TRUE) + 
  labs(
    title = "Correlation Between Pre and Post COVID Housing Prices",
    subtitle = "Strong positive correlation between ZHVI pre- and post-pandemic",
    x = "Pre-COVID Average Housing Prices (ZHVI)",
    y = "Post-COVID Average Housing Prices (ZHVI)"
  ) +
  theme_minimal()

7.2.2 Mortgage

# Pearson correlation of state-level pre vs post mortgage growth rates.
cor_mort <- cor.test(merged_df$mortgage_pre, merged_df$mortgage_post, method = "pearson")
cor_mort
## 
##  Pearson's product-moment correlation
## 
## data:  merged_df$mortgage_pre and merged_df$mortgage_post
## t = 5.7317, df = 49, p-value = 6.045e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4336644 0.7739861
## sample estimates:
##       cor 
## 0.6335339
# Scatter with a linear fit; moderate positive correlation (r = 0.63).
ggplot(merged_df, aes(x = mortgage_pre, y = mortgage_post)) +
  geom_point(color = "blue") + 
  geom_smooth(method = "lm", color = "red", se = TRUE) + 
  labs(
    title = "Correlation Between Pre and Post COVID Mortgage",
    x = "Pre-COVID Average Mortgage",
    y = "Post-COVID Average Mortgage"
  ) +
  theme_minimal()

7.2.3 Income

# Pearson correlation of state-level pre vs post income growth rates.
cor_income <- cor.test(merged_df$income_pre, merged_df$income_post, method = "pearson")
cor_income
## 
##  Pearson's product-moment correlation
## 
## data:  merged_df$income_pre and merged_df$income_post
## t = 2.3517, df = 49, p-value = 0.02275
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.04700942 0.54612103
## sample estimates:
##       cor 
## 0.3184673
# Scatter with a linear fit; weak positive correlation (r = 0.32).
ggplot(merged_df, aes(x = income_pre, y = income_post)) +
  geom_point(color = "blue") + 
  geom_smooth(method = "lm", color = "red", se = TRUE) + 
  labs(
    title = "Correlation Between Pre and Post COVID Income",
    x = "Pre-COVID Average Income",
    y = "Post-COVID Average Income"
  ) +
  theme_minimal()

7.3 Multiple Linear Regression

7.3.1 ZHVI model 1

# Multiple linear regression model
# Post-COVID average ZHVI regressed on its pre-COVID level plus post-COVID
# GDP, income, and population measures.
model_multiple <- lm(zhvi_post_avg ~ zhvi_pre_avg + gdp_post + income_post + population_post, data = merged_df)

# View summary -- note income_post is not significant (p = 0.396), which
# motivates the reduced model in 7.3.2.
summary(model_multiple)
## 
## Call:
## lm(formula = zhvi_post_avg ~ zhvi_pre_avg + gdp_post + income_post + 
##     population_post, data = merged_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31408  -9558  -1851  10153  52293 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -1.288e+04  1.610e+04  -0.800  0.42755    
## zhvi_pre_avg     1.201e+00  2.304e-02  52.111  < 2e-16 ***
## gdp_post         2.692e+03  1.073e+03   2.508  0.01575 *  
## income_post     -9.186e+02  1.073e+03  -0.856  0.39625    
## population_post  9.310e+03  2.779e+03   3.350  0.00162 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16620 on 46 degrees of freedom
## Multiple R-squared:  0.9854, Adjusted R-squared:  0.9841 
## F-statistic: 774.8 on 4 and 46 DF,  p-value: < 2.2e-16

7.3.2 ZHVI model 2

# Multiple linear regression model
# Reduced model: income_post dropped (p = 0.396 in model 1).
# NOTE(review): this reuses the name `model_multiple`, overwriting model 1;
# consider a distinct name if both models are needed later.
model_multiple <- lm(zhvi_post_avg ~ zhvi_pre_avg + gdp_post + population_post, data = merged_df)

# View summary
summary(model_multiple)
## 
## Call:
## lm(formula = zhvi_post_avg ~ zhvi_pre_avg + gdp_post + population_post, 
##     data = merged_df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -32313 -10163   -790  11831  52531 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -2.489e+04  7.881e+03  -3.158  0.00277 ** 
## zhvi_pre_avg     1.205e+00  2.243e-02  53.734  < 2e-16 ***
## gdp_post         2.975e+03  1.018e+03   2.922  0.00533 ** 
## population_post  8.007e+03  2.319e+03   3.453  0.00118 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16580 on 47 degrees of freedom
## Multiple R-squared:  0.9851, Adjusted R-squared:  0.9842 
## F-statistic:  1039 on 3 and 47 DF,  p-value: < 2.2e-16
# Load the cleaned final dataset used for the ANOVA / tree / clustering below.
df_stat <- read_csv("data/data-clean/FINAL.csv")
# df_stat

7.4 ANOVA

Variable ANOVA p-value Post-Hoc Interpretation Shapiro p-value Levene p-value
gdp_post 2.40e-08 All three groups significantly different from each other 0.798 0.273
income_post 0.039 Low group < Medium & High groups (no difference between Medium & High) 0.299 0.344
unemp_post 0.122 No statistically significant differences between groups 0.679 0.605
total_post 0.00016 Differences exist; Post-hoc details required (likely Low vs High) 0.401 0.138
creditcard_post 0.333 No statistically significant differences between groups 0.873 0.188

7.4.1 Population

# Split states into population terciles (Small / Medium / Large) using the
# 0, 1/3, 2/3, 1 quantile breaks of population_post.
df_stat$pop_group <- cut(df_stat$population_post,
                    breaks = quantile(df_stat$population_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
                    labels = c("Small", "Medium", "Large"),
                    include.lowest = TRUE)

# One-way ANOVA: does post-COVID housing price change differ across groups?
pop_anova <- aov(zhvi_post_change ~ pop_group, data = df_stat)
summary(pop_anova)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## pop_group    2   2412  1206.2   25.39 3.01e-08 ***
## Residuals   48   2280    47.5                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fisher LSD post-hoc: which pairs of groups differ?
PostHocTest(pop_anova, method = "lsd")
## 
##   Posthoc multiple comparisons of means : Fisher LSD 
##     95% family-wise confidence level
## 
## $pop_group
##                   diff     lwr.ci   upr.ci    pval    
## Medium-Small  4.031883 -0.7216142  8.78538  0.0946 .  
## Large-Small  16.181339 11.4278415 20.93484 1.3e-08 ***
## Large-Medium 12.149456  7.3959584 16.90295 5.0e-06 ***
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Boxplots of the response by group.
ggplot(df_stat, aes(x = pop_group, y = zhvi_post_change, fill = pop_group)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Post-COVID Housing Price Change by Population Group",
       x = "Population Group",
       y = "zhvi_post_change")

# residual analysis to check the assumptions
# Shapiro-Wilk on residuals: p = 0.18 > 0.05, normality holds.
shapiro.test(pop_anova$residuals)  
## 
##  Shapiro-Wilk normality test
## 
## data:  pop_anova$residuals
## W = 0.96771, p-value = 0.1774
# Levene's test for homogeneity of variance (p = 0.34, assumption holds).
leveneTest(zhvi_post_change ~ factor(pop_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  1.0936 0.3432
##       48
# Interactive table of each state's group assignment.
pop_group_table <- df_stat %>%
  select(State, population_post, zhvi_post_change, pop_group)


datatable(pop_group_table,
          options = list(pageLength = 10, autoWidth = TRUE),
          caption = "States by Population Group (Small / Medium / Large)")

7.4.2 Income

# Income terciles (Low / Medium / High) by income_post quantile breaks.
df_stat$income_group <- cut(df_stat$income_post,
                       breaks = quantile(df_stat$income_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
                       labels = c("Low", "Medium", "High"),
                       include.lowest = TRUE)

# One-way ANOVA on housing price change across income groups.
income_anova <- aov(zhvi_post_change ~ income_group, data = df_stat)
summary(income_anova)
##              Df Sum Sq Mean Sq F value Pr(>F)  
## income_group  2    593   296.7   3.474  0.039 *
## Residuals    48   4099    85.4                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fisher LSD post-hoc pairwise comparisons.
PostHocTest(income_anova, method = "lsd")
## 
##   Posthoc multiple comparisons of means : Fisher LSD 
##     95% family-wise confidence level
## 
## $income_group
##                   diff     lwr.ci    upr.ci   pval    
## Medium-Low   7.6532966  1.2800167 14.026577 0.0196 *  
## High-Low     6.7293284  0.3560485 13.102608 0.0389 *  
## High-Medium -0.9239682 -7.2972481  5.449312 0.7719    
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Boxplots of the response by group.
ggplot(df_stat, aes(x = income_group, y = zhvi_post_change, fill = income_group)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Housing Price Change by Income Group",
       x = "Income Group",
       y = "zhvi_post_change")

# residual analysis to check the assumptions
# Shapiro-Wilk: p = 0.93, normality holds.
shapiro.test(income_anova$residuals)  
## 
##  Shapiro-Wilk normality test
## 
## data:  income_anova$residuals
## W = 0.98935, p-value = 0.9254
# Levene's test for homogeneity of variance (p = 0.34, assumption holds).
leveneTest(zhvi_post_change ~ factor(income_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  1.0908 0.3441
##       48
# Interactive table of each state's group assignment.
income_group_table <- df_stat %>%
  select(State, income_post, zhvi_post_change, income_group)

datatable(income_group_table,
          options = list(pageLength = 10, autoWidth = TRUE),
          caption = "States by Income Group (Low / Medium / High)")

7.4.3 GDP

# GDP terciles (Low / Medium / High) by gdp_post quantile breaks.
df_stat$gdp_group <- cut(df_stat$gdp_post,
                         breaks = quantile(df_stat$gdp_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
                         labels = c("Low", "Medium", "High"),
                         include.lowest = TRUE)

# One-way ANOVA on housing price change across GDP groups.
gdp_anova <- aov(zhvi_post_change ~ gdp_group, data = df_stat)
summary(gdp_anova)
##             Df Sum Sq Mean Sq F value  Pr(>F)    
## gdp_group    2   2434  1216.9   25.86 2.4e-08 ***
## Residuals   48   2259    47.1                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fisher LSD post-hoc: all three pairs differ significantly.
PostHocTest(gdp_anova, method = "lsd")
## 
##   Posthoc multiple comparisons of means : Fisher LSD 
##     95% family-wise confidence level
## 
## $gdp_group
##                 diff    lwr.ci    upr.ci    pval    
## Medium-Low   5.17528  0.444203  9.906358  0.0327 *  
## High-Low    16.53958 11.808498 21.270653 6.6e-09 ***
## High-Medium 11.36430  6.633218 16.095372 1.4e-05 ***
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Assumption checks: Shapiro p = 0.54 (normality), Levene p = 0.27 (equal var).
shapiro.test(gdp_anova$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  gdp_anova$residuals
## W = 0.98001, p-value = 0.5392
leveneTest(zhvi_post_change ~ factor(gdp_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  1.3341  0.273
##       48
# Boxplots of the response by group.
ggplot(df_stat, aes(x = gdp_group, y = zhvi_post_change, fill = gdp_group)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Housing Price Change by GDP Group",
       x = "GDP Group",
       y = "zhvi_post_change")

# Interactive table of each state's group assignment.
gdp_group_table <- df_stat %>%
  select(State, gdp_post, zhvi_post_change, gdp_group)

datatable(gdp_group_table,
          options = list(pageLength = 10, autoWidth = TRUE),
          caption = "States by GDP Group (Low / Medium / High)")

7.4.4 Unemployment

# Unemployment-change terciles (Low / Medium / High) by unemp_post quantiles.
df_stat$unemp_group <- cut(df_stat$unemp_post,
                           breaks = quantile(df_stat$unemp_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
                           labels = c("Low", "Medium", "High"),
                           include.lowest = TRUE)

# One-way ANOVA: overall test is NOT significant (p = 0.122), so the
# post-hoc comparisons below should be read with caution.
unemp_anova <- aov(zhvi_post_change ~ unemp_group, data = df_stat)
summary(unemp_anova)
##             Df Sum Sq Mean Sq F value Pr(>F)
## unemp_group  2    394  197.17   2.202  0.122
## Residuals   48   4298   89.55
PostHocTest(unemp_anova, method = "lsd")
## 
##   Posthoc multiple comparisons of means : Fisher LSD 
##     95% family-wise confidence level
## 
## $unemp_group
##                 diff     lwr.ci    upr.ci   pval    
## Medium-Low  4.306174 -2.1287038 10.741051 0.1848    
## High-Low    6.813317  0.1859737 13.440661 0.0441 *  
## High-Medium 2.507144 -4.0303336  9.044621 0.4444    
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Assumption checks: Shapiro p = 0.31 (normality), Levene p = 0.61 (equal var).
shapiro.test(unemp_anova$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  unemp_anova$residuals
## W = 0.97346, p-value = 0.3061
leveneTest(zhvi_post_change ~ factor(unemp_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  0.5078  0.605
##       48
# Boxplots of the response by group.
ggplot(df_stat, aes(x = unemp_group, y = zhvi_post_change, fill = unemp_group)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Housing Price Change by Unemployment Group",
       x = "Unemployment Group",
       y = "zhvi_post_change")

# Interactive table of each state's group assignment.
unemp_group_table <- df_stat %>%
  select(State, unemp_post, zhvi_post_change, unemp_group)

datatable(unemp_group_table,
          options = list(pageLength = 10, autoWidth = TRUE),
          caption = "States by Unemployment Group (Low / Medium / High)")

7.4.5 Total Debt

# Total-debt terciles (Low / Medium / High) by total_post quantile breaks.
df_stat$total_group <- cut(df_stat$total_post,
                           breaks = quantile(df_stat$total_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
                           labels = c("Low", "Medium", "High"),
                           include.lowest = TRUE)

# One-way ANOVA on housing price change across total-debt groups.
total_anova <- aov(zhvi_post_change ~ total_group, data = df_stat)
summary(total_anova)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## total_group  2   1435   717.6   10.57 0.000157 ***
## Residuals   48   3258    67.9                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fisher LSD post-hoc: High differs from both Low and Medium.
PostHocTest(total_anova, method = "lsd")
## 
##   Posthoc multiple comparisons of means : Fisher LSD 
##     95% family-wise confidence level
## 
## $total_group
##                   diff    lwr.ci    upr.ci    pval    
## Medium-Low  -0.1750897 -5.856411  5.506232 0.95085    
## High-Low    11.1646443  5.483323 16.845966 0.00025 ***
## High-Medium 11.3397340  5.658412 17.021056 0.00021 ***
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Assumption checks: Shapiro p = 0.019 < 0.05, so the normality assumption
# is questionable here; Levene p = 0.14 (equal variances OK).
shapiro.test(total_anova$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  total_anova$residuals
## W = 0.94454, p-value = 0.01876
leveneTest(zhvi_post_change ~ factor(total_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  2.0672 0.1377
##       48
# Boxplots of the response by group.
ggplot(df_stat, aes(x = total_group, y = zhvi_post_change, fill = total_group)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Housing Price Change by Total Debt Group",
       x = "Total Debt Group",
       y = "zhvi_post_change")

# Interactive table of each state's group assignment.
total_group_table <- df_stat %>%
  select(State, total_post, zhvi_post_change, total_group)

datatable(total_group_table,
          options = list(pageLength = 10, autoWidth = TRUE),
          caption = "States by Total Debt Group (Low / Medium / High)")

7.4.6 Credit Card

# Credit-card-debt terciles (Low / Medium / High) by creditcard_post quantiles.
df_stat$creditcard_group <- cut(df_stat$creditcard_post,
                                breaks = quantile(df_stat$creditcard_post, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
                                labels = c("Low", "Medium", "High"),
                                include.lowest = TRUE)

# One-way ANOVA: overall test is NOT significant (p = 0.333); the post-hoc
# output below is reported for completeness only.
creditcard_anova <- aov(zhvi_post_change ~ creditcard_group, data = df_stat)
summary(creditcard_anova)
##                  Df Sum Sq Mean Sq F value Pr(>F)
## creditcard_group  2    210  104.99   1.124  0.333
## Residuals        48   4483   93.39
PostHocTest(creditcard_anova, method = "lsd")
## 
##   Posthoc multiple comparisons of means : Fisher LSD 
##     95% family-wise confidence level
## 
## $creditcard_group
##                  diff    lwr.ci    upr.ci   pval    
## Medium-Low   4.930728 -1.733912 11.595367 0.1434    
## High-Low     3.007060 -3.657579  9.671699 0.3688    
## High-Medium -1.923668 -8.588307  4.740972 0.5644    
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Assumption checks: Shapiro p = 0.93 (normality), Levene p = 0.19 (equal var).
shapiro.test(creditcard_anova$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  creditcard_anova$residuals
## W = 0.98949, p-value = 0.9295
leveneTest(zhvi_post_change ~ factor(creditcard_group), data = df_stat)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  1.7293 0.1883
##       48
# Boxplots of the response by group.
ggplot(df_stat, aes(x = creditcard_group, y = zhvi_post_change, fill = creditcard_group)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Housing Price Change by Credit Card Debt Group",
       x = "Credit Card Debt Group",
       y = "zhvi_post_change")

# Interactive table of each state's group assignment.
credit_group_table <- df_stat %>%
  select(State, creditcard_post, zhvi_post_change, creditcard_group)

datatable(credit_group_table,
          options = list(pageLength = 10, autoWidth = TRUE),
          caption = "States by Credit Card Debt Group (Low / Medium / High)")

7.5 Decision Tree

# Regression tree predicting post-COVID housing price change from the
# post-COVID economic indicators. Low cp + small minsplit/minbucket grow a
# fairly deep tree on only 51 observations.
tree_model <- rpart(zhvi_post_change ~ zhvi_post_avg + population_post + gdp_post + income_post + unemp_post + total_post + creditcard_post
                    ,data = df_stat
                    ,method = "anova"
                    ,control = rpart.control(cp = 0.0001,minsplit = 10, minbucket = 4)) # method = "anova" (regression), not "class": house-price change is continuous

# extra = 101 shows the number and percentage of observations in each node.
rpart.plot(tree_model,
           extra = 101,      
           fallen.leaves = TRUE,
           main = "Decision Tree")

# Variable importance as computed by rpart (higher = more influential splits).
tree_model$variable.importance
##        gdp_post population_post      total_post creditcard_post     income_post 
##       2899.6427       2486.3110       1887.3291        836.0229        485.0208 
##   zhvi_post_avg      unemp_post 
##        309.8245        182.1748
# Leaf-node id assigned to each observation (row) by the fitted tree.
node_assignments <- tree_model$where

tree_nodes <- data.frame(State = df_stat$State, Node = node_assignments)

#tree_nodes
datatable(tree_nodes, 
          filter = "top",
          caption = "state")

7.6 Clustering

7.6.1 6 vars

# Standardize the six post-COVID variables so k-means distances are not
# dominated by large-scale columns (e.g. zhvi_post_avg).
clusters <- c("population_post", "gdp_post", "income_post", "unemp_post", "total_post", "zhvi_post_avg")
cluster_z <- scale(df_stat[, clusters] )

# Seed fixed for reproducible cluster assignments.
set.seed(612)
# Renamed from `kmeans` so the result does not mask stats::kmeans().
km_fit <- kmeans(cluster_z, centers = 5)

#km_fit

df_stat$cluster <- as.factor(km_fit$cluster)

# Distribution of housing price change within each k-means cluster.
ggplot(df_stat, aes(x = cluster, y = zhvi_post_change, fill = cluster)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Cluster for Post COVID",
       x = "Cluster Group",
       y = "zhvi_post_change")

cluster_nodes <- data.frame(State = df_stat$State, Cluster = df_stat$cluster)

#cluster_nodes
# List the member states of each cluster.
split(cluster_nodes$State, cluster_nodes$Cluster)
## $`1`
## [1] "CA" "DC" "HI" "MA"
## 
## $`2`
##  [1] "AL" "GA" "IN" "KS" "KY" "ME" "MN" "MO" "NH" "OR" "SC" "VT" "VA" "WI"
## 
## $`3`
##  [1] "AK" "CT" "IL" "LA" "MD" "MI" "MS" "NJ" "NM" "NY" "OH" "PA" "RI" "WV"
## 
## $`4`
##  [1] "AZ" "CO" "FL" "ID" "MT" "NV" "NC" "TN" "TX" "UT" "WA"
## 
## $`5`
## [1] "AR" "DE" "IA" "NE" "ND" "OK" "SD" "WY"
# Per-cluster means of the (unscaled) clustering variables, rounded for display.
cluster_summary <- aggregate(df_stat[, clusters], by = list(Cluster = df_stat$cluster), FUN = mean)
cluster_summary[ , -1] <- round(cluster_summary[ , -1], 2)
print(cluster_summary)
##   Cluster population_post gdp_post income_post unemp_post total_post
## 1       1            1.25     7.76       10.07      -4.65      13.62
## 2       2            2.24     8.05       12.35      -1.89      11.48
## 3       3            1.19     6.02       10.49      -3.55      11.12
## 4       4            4.13    11.76       14.58      -2.51      16.86
## 5       5            2.80     5.51       16.95      -1.64      11.73
##   zhvi_post_avg
## 1      628886.4
## 2      265625.4
## 3      266754.1
## 4      366580.3
## 5      229319.0

7.6.2 4 vars

# Second clustering on four variables: population, income, total debt, and
# average post-COVID ZHVI (standardized before k-means).
clusters2 <- c("population_post", "income_post", "total_post", "zhvi_post_avg")
cluster_z2 <- scale(df_stat[, clusters2])

# Seed fixed for reproducible cluster assignments.
set.seed(612)
kmeans2 <- kmeans(cluster_z2, centers = 6)
df_stat$cluster2 <- as.factor(kmeans2$cluster)

# Title fixed: this clustering uses population, income, total debt and ZHVI
# (clusters2 above) -- GDP is not included.
ggplot(df_stat, aes(x = cluster2, y = zhvi_post_change, fill = cluster2)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Clustering: Population + Income + Total Debt + ZHVI",
       x = "Cluster",
       y = "Housing Price Change (%)")

# Map each state to its 6-cluster assignment and list members per cluster.
cluster_nodes2 <- data.frame(State = df_stat$State, Cluster = df_stat$cluster2)
split(cluster_nodes2$State, cluster_nodes2$Cluster)
## $`1`
## [1] "AR" "IA" "NE" "ND" "OK" "SD" "WY"
## 
## $`2`
##  [1] "AZ" "DE" "FL" "GA" "MT" "NV" "NC" "SC" "TN" "TX"
## 
## $`3`
## [1] "CT" "MD" "MA" "NH" "NJ" "NY" "PA" "RI" "VT"
## 
## $`4`
## [1] "CO" "ID" "UT"
## 
## $`5`
## [1] "CA" "DC" "HI" "OR" "WA"
## 
## $`6`
##  [1] "AL" "AK" "IL" "IN" "KS" "KY" "LA" "ME" "MI" "MN" "MS" "MO" "NM" "OH" "VA"
## [16] "WV" "WI"
# Per-cluster means of the four clustering variables, rounded for display.
cluster_summary2 <- aggregate(df_stat[, clusters2], by = list(Cluster = df_stat$cluster2), FUN = mean)
cluster_summary2[ , -1] <- round(cluster_summary2[ , -1], 2)
print(cluster_summary2)
##   Cluster population_post income_post total_post zhvi_post_avg
## 1       1            2.51       17.21      12.12      216543.8
## 2       2            4.14       13.60      14.35      299750.4
## 3       3            1.91        9.70      10.57      359597.0
## 4       4            4.54       16.79      19.66      446324.5
## 5       5            1.61       10.90      15.33      593770.3
## 6       6            1.35       12.19      11.23      222252.1